
/****************************************************************************************************
Do file name: 2_ConcursoGradesBuild.do
Author: Thiago Scot
This version: April 16th, 2019
DEscription: Cleans the Concurso database
****************************************************************************************************/
clear all

use "$data_input/data_examinations.dta"

*Cleaning names string
gen fullname = upper(Name)
replace fullname = subinstr(fullname,"  "," ",.)
replace fullname = subinstr(fullname," (SUB JUDICE)","",.)
replace fullname = subinstr(fullname,"(SUB JUDICE)","",.)
replace fullname = subinstr(fullname,"(SUBJUDICE)","",.)
replace fullname = subinstr(fullname,"- JUIZ SUBSTITUTO","",.)
replace fullname = subinstr(fullname,"- JUIZA SUBSTITUTA","",.)
replace fullname = subinstr(fullname," (SUB JUDICE - DEFICIENTE)","",.)
replace fullname = subinstr(fullname," (NEGRO)","",.)
replace fullname = subinstr(fullname,"(N)","",.)
replace fullname = trim(fullname)

*Cleaning some wrong grades from python extraction
replace Grade_Titles = trim(Grade_Titles)
replace Grade_Titles = "2.50" if Grade_Titles == "2 2.50" 			//correcting a mistake in imputation from Python
replace Grade_Titles = "2.00" if fullname == "RODRIGO COSTA DE LIMA FURTADO" 
replace Oral_Exam = "7.74" if Oral_Exam == "7. 74 "					//mnually correcting entry mistake

foreach var of varlist Rank Final_grade Grade_Titles Essay_Exam Objective_Exam Oral_Exam Written_Exam Written_Exam1 Written_Exam2 Essay_Exam_Civil Essay_Exam_Penal Objective_Questions Average_Written Specific_Exams{
	replace `var' = trim(`var')
	destring `var', replace
}

*Some concursos only provide final essay grades, while most separate in Civil and Penal codes. Create average
gen Essay_Exam_imp = Essay_Exam
egen avg_Essay_Exam = rowmean(Essay_Exam_Civil Essay_Exam_Penal)
replace Essay_Exam_imp = avg_Essay_Exam if Essay_Exam_imp == .
drop avg_Essay_Exam

*For TJSP_13, we have no final score, so construct it from grades+weights
replace Final_grade = 0.1*Objective_Exam + 0.3*Written_Exam + 0.3*Essay_Exam_imp + .2*Oral_Exam + 0.1*Grade_Titles  if Concurso == "TJSP_13"

*For TJPA_09, two separate grades for written, take average
replace Written_Exam = (Written_Exam1 + Written_Exam2)/2 if Concurso == "TJPA_09" 

*For TJTO_07, two separate grades for written, take average
replace Written_Exam = (Written_Exam1 + Written_Exam2)/2 if Concurso == "TJTO_07" 

*For TJPR_09, grades out of 100, transform in out of 10
foreach var of varlist Objective_Exam Oral_Exam Written_Exam {
	replace `var' = `var'/10 if Concurso == "TJPR_09" 
}

duplicates drop Concurso Name Final_grade Rank, force

*Generating Rankings
/*
Step 1: Sort by grade within concurso, then create rank
Step 2: For concursos without final grade available, replace rank with rank collected directly from concurso pdfs
*/

gsort Concurso -Final_grade
by Concurso: gen Rank_imp = _n

replace Rank_imp = Rank if Final_grade == .

duplicates drop Number fullname Concurso, force

order fullname Concurso Year Rank Rank_imp Final_grade Grade_Titles Essay_Exam-Specific_Exams Name Number 

save "$temp/judges_merged.dta", replace


**MERGING WITH CONCURSO METADATA
clear all 
set more off
set seed 1234

import excel "$data_input/ConcursosJuizFederal.xlsx", ///
sheet("Sheet1") firstrow clear

rename Codigo Concurso

save "$temp/concursos.dta", replace

*** Join judges + concurso datasets and save for analysis ****
use  "$temp/judges_merged.dta", clear

merge m:1 Concurso using "$temp/concursos.dta"

drop if _merge == 2

sort fullname

egen Judge_ID = group(fullname)

save  "$temp/judges_merged_concurso.dta", replace
